{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os \n",
    "SPAM_PATH = os.path.join(\"datasets\",\"spam\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "HAM_DIR = os.path.join(SPAM_PATH,\"easy_ham\")\n",
    "SPAM_DIR = os.path.join(SPAM_PATH,\"spam\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "ham_filenames = [name for name in sorted(os.listdir(HAM_DIR))if len(name)>20]\n",
    "spam_filenames = [name for name in sorted(os.listdir(SPAM_DIR))if len(name)>20]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import email\n",
    "import email.policy\n",
    "def load_email(is_spam,filename,spam_path=SPAM_PATH):\n",
    "    directory=\"spam\"if is_spam else\"easy_ham\"\n",
    "    with open(os.path.join(spam_path,directory,filename),\"rb\")as f:\n",
    "        return email.parser.BytesParser(policy=email.policy.default).parse(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Martin A posted:\n",
      "Tassos Papadopoulos, the Greek sculptor behind the plan, judged that the\n",
      " limestone of Mount Kerdylio, 70 miles east of Salonika and not far from the\n",
      " Mount Athos monastic community, was ideal for the patriotic sculpture. \n",
      " \n",
      " As well as Alexander's granite features, 240 ft high and 170 ft wide, a\n",
      " museum, a restored amphitheatre and car park for admiring crowds are\n",
      "planned\n",
      "---------------------\n",
      "So is this mountain limestone or granite?\n",
      "If it's limestone, it'll weather pretty fast.\n",
      "\n",
      "------------------------ Yahoo! Groups Sponsor ---------------------~-->\n",
      "4 DVDs Free +s&p Join Now\n",
      "http://us.click.yahoo.com/pt6YBB/NXiEAA/mG3HAA/7gSolB/TM\n",
      "---------------------------------------------------------------------~->\n",
      "\n",
      "To unsubscribe from this group, send an email to:\n",
      "forteana-unsubscribe@egroups.com\n",
      "\n",
      " \n",
      "\n",
      "Your use of Yahoo! Groups is subject to http://docs.yahoo.com/info/terms/\n"
     ]
    }
   ],
   "source": [
    "ham_emails = [load_email(is_spam=False,filename=name)for name in ham_filenames]\n",
    "spam_emails = [load_email(is_spam=True,filename=name)for name in spam_filenames]\n",
    "print(ham_emails[1].get_content().strip())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_email_structure(email):\n",
    "    if isinstance(email,str):\n",
    "        return email\n",
    "    payload = email.get_payload()\n",
    "    if isinstance(payload,list):\n",
    "        return \"multipart({})\".format(\",\".join([\n",
    "            get_email_structure(sub_email)\n",
    "            for sub_email in payload\n",
    "        ]))\n",
    "    else:\n",
    "        return email.get_content_type()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Counter({1: 2, 4: 3, 2: 3, 3: 2})"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from collections import Counter\n",
    "a = [1,4,2,3,2,3,4,2,1,4]\n",
    "b=Counter(a)\n",
    "b"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "def structures_counter(emails):\n",
    "    structures=Counter()\n",
    "    for email in emails:\n",
    "        structure=get_e(email)\n",
    "        structures[structure]+=1\n",
    "    return structures"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('text/plain', 2408),\n",
       " ('multipart(text/plain,application/pgp-signature)', 66),\n",
       " ('multipart(text/plain,text/html)', 8),\n",
       " ('multipart(text/plain,text/plain)', 4),\n",
       " ('multipart(text/plain)', 3),\n",
       " ('multipart(text/plain,application/octet-stream)', 2),\n",
       " ('multipart(text/plain,text/enriched)', 1),\n",
       " ('multipart(text/plain,application/ms-tnef,text/plain)', 1),\n",
       " ('multipart(multipart(text/plain,text/plain,text/plain),application/pgp-signature)',\n",
       "  1),\n",
       " ('multipart(text/plain,video/mng)', 1),\n",
       " ('multipart(text/plain,multipart(text/plain))', 1),\n",
       " ('multipart(text/plain,application/x-pkcs7-signature)', 1),\n",
       " ('multipart(text/plain,multipart(text/plain,text/plain),text/rfc822-headers)',\n",
       "  1),\n",
       " ('multipart(text/plain,multipart(text/plain,text/plain),multipart(multipart(text/plain,application/x-pkcs7-signature)))',\n",
       "  1),\n",
       " ('multipart(text/plain,application/x-java-applet)', 1)]"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "structures_counter(ham_emails).most_common()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Return-Path : <ilug-admin@linux.ie>\n",
      "Delivered-To : zzzz@localhost.spamassassin.taint.org\n",
      "Received : from localhost (localhost [127.0.0.1])\tby phobos.labs.spamassassin.taint.org (Postfix) with ESMTP id A7FD7454F6\tfor <zzzz@localhost>; Thu, 22 Aug 2002 08:27:38 -0400 (EDT)\n",
      "Received : from phobos [127.0.0.1]\tby localhost with IMAP (fetchmail-5.9.0)\tfor zzzz@localhost (single-drop); Thu, 22 Aug 2002 13:27:38 +0100 (IST)\n",
      "Received : from lugh.tuatha.org (root@lugh.tuatha.org [194.125.145.45]) by    dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id g7MCJiZ06043 for    <zzzz-ilug@jmason.org>; Thu, 22 Aug 2002 13:19:44 +0100\n",
      "Received : from lugh (root@localhost [127.0.0.1]) by lugh.tuatha.org    (8.9.3/8.9.3) with ESMTP id NAA29323; Thu, 22 Aug 2002 13:18:52 +0100\n",
      "Received : from email.qves.com ([67.104.83.251]) by lugh.tuatha.org    (8.9.3/8.9.3) with ESMTP id NAA29282 for <ilug@linux.ie>; Thu,    22 Aug 2002 13:18:37 +0100\n",
      "X-Authentication-Warning : lugh.tuatha.org: Host [67.104.83.251] claimed to    be email.qves.com\n",
      "Received : from qvp0091 ([169.254.6.22]) by email.qves.com with Microsoft    SMTPSVC(5.0.2195.2966); Thu, 22 Aug 2002 06:18:18 -0600\n",
      "From : Slim Down <taylor@s3.serveimage.com>\n",
      "To : ilug@linux.ie\n",
      "Date : Thu, 22 Aug 2002 06:18:18 -0600\n",
      "Message-Id : <59e6301c249d5$ffb7ea20$1606fea9@freeyankeedom.com>\n",
      "MIME-Version : 1.0\n",
      "Content-Type : text/plain; charset=\"iso-8859-1\"\n",
      "Content-Transfer-Encoding : 7bit\n",
      "X-Mailer : Microsoft CDO for Windows 2000\n",
      "Thread-Index : AcJJ1f+3FWdz11AmR6uWbmQN5gGxxw==\n",
      "Content-Class : urn:content-classes:message\n",
      "X-Mimeole : Produced By Microsoft MimeOLE V6.00.2462.0000\n",
      "X-Originalarrivaltime : 22 Aug 2002 12:18:18.0699 (UTC) FILETIME=[FFB949B0:01C249D5]\n",
      "Subject : [ILUG] Guaranteed to lose 10-12 lbs in 30 days 10.206\n",
      "Sender : ilug-admin@linux.ie\n",
      "Errors-To : ilug-admin@linux.ie\n",
      "X-Mailman-Version : 1.1\n",
      "Precedence : bulk\n",
      "List-Id : Irish Linux Users' Group <ilug.linux.ie>\n",
      "X-Beenthere : ilug@linux.ie\n"
     ]
    }
   ],
   "source": [
    "for header,value in spam_emails[1].items():\n",
    "    print(header,\":\",value)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'FORTUNE 500 COMPANY HIRING, AT HOME REPS.'"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "spam_emails[6]['Subject']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from sklearn.model_selection import train_test_split\n",
    "x=np.array(ham_emails+spam_emails)\n",
    "y=np.array([0]*len(ham_emails)+[1]*len(spam_emails))\n",
    "x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re \n",
    "from html import unescape\n",
    "def html_to_plain_text(html):\n",
    "    text=re.sub('<head.*?>.*?</head>','',html,flags=re.M | re.S|re.I)\n",
    "    text=re.sub('<a\\s.*?>','HYPERLINK',text,flags=re.M | re.S|re.I)\n",
    "    text=re.sub('<.*?>','',text,flags=re.M | re.S)\n",
    "    text=re.sub(r'(\\s*\\n)+','\\n',text,flags=re.M | re.S)\n",
    "    return unescape(text)\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<HTML><HEAD><TITLE></TITLE><META http-equiv=\"Content-Type\" content=\"text/html; charset=windows-1252\"><STYLE>A:link {TEX-DECORATION: none}A:active {TEXT-DECORATION: none}A:visited {TEXT-DECORATION: none}A:hover {COLOR: #0033ff; TEXT-DECORATION: underline}</STYLE><META content=\"MSHTML 6.00.2713.1100\" name=\"GENERATOR\"></HEAD>\n",
      "<BODY text=\"#000000\" vLink=\"#0033ff\" link=\"#0033ff\" bgColor=\"#CCCC99\"><TABLE borderColor=\"#660000\" cellSpacing=\"0\" cellPadding=\"0\" border=\"0\" width=\"100%\"><TR><TD bgColor=\"#CCCC99\" valign=\"top\" colspan=\"2\" height=\"27\">\n",
      "<font size=\"6\" face=\"Arial, Helvetica, sans-serif\" color=\"#660000\">\n",
      "<b>OTC</b></font></TD></TR><TR><TD height=\"2\" bgcolor=\"#6a694f\">\n",
      "<font size=\"5\" face=\"Times New Roman, Times, serif\" color=\"#FFFFFF\">\n",
      "<b>&nbsp;Newsletter</b></font></TD><TD height=\"2\" bgcolor=\"#6a694f\"><div align=\"right\"><font color=\"#FFFFFF\">\n",
      "<b>Discover Tomorrow's Winners&nbsp;</b></font></div></TD></TR><TR><TD height=\"25\" colspan=\"2\" bgcolor=\"#CCCC99\"><table width=\"100%\" border=\"0\"  ...\n"
     ]
    }
   ],
   "source": [
    "html_spam_emails = [email for email in x_train[y_train==1]if get_email_structure(email)==\"text/html\"]\n",
    "sample_html_spam=html_spam_emails[7]\n",
    "print(sample_html_spam.get_content().strip()[:1000],\"...\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "OTC\n",
      " Newsletter\n",
      "Discover Tomorrow's Winners \n",
      "For Immediate Release\n",
      "Cal-Bay (Stock Symbol: CBYI)\n",
      "Watch for analyst \"Strong Buy Recommendations\" and several advisory newsletters picking CBYI.  CBYI has filed to be traded on the OTCBB, share prices historically INCREASE when companies get listed on this larger trading exchange. CBYI is trading around 25 cents and should skyrocket to $2.66 - $3.25 a share in the near future.\n",
      "Put CBYI on your watch list, acquire a position TODAY.\n",
      "REASONS TO INVEST IN CBYI\n",
      "A profitable company and is on track to beat ALL earnings estimates!\n",
      "One of the FASTEST growing distributors in environmental & safety equipment instruments.\n",
      "Excellent management team, several EXCLUSIVE contracts.  IMPRESSIVE client list including the U.S. Air Force, Anheuser-Busch, Chevron Refining and Mitsubishi Heavy Industries, GE-Energy & Environmental Research.\n",
      "RAPIDLY GROWING INDUSTRY\n",
      "Industry revenues exceed $900 million, estimates indicate that there could be as much as $25 billi ...\n"
     ]
    }
   ],
   "source": [
    "print(html_to_plain_text(sample_html_spam.get_content())[:1000],\"...\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "def email_to_text(email):\n",
    "    html=None\n",
    "    for part in email.walk():\n",
    "        ctype = part.get_content_type()\n",
    "        if not ctype in (\"text/plain\",\"text/html\"):\n",
    "            continue\n",
    "        try:\n",
    "            content=part.get_content()\n",
    "        except:\n",
    "            content=str(part.get_payload())\n",
    "        if ctype==\"text/plain\":\n",
    "            return content\n",
    "        else:\n",
    "            html = content\n",
    "    if html:\n",
    "        return html_to_plain_text(html)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "OTC\n",
      " Newsletter\n",
      "Discover Tomorrow's Winners \n",
      "For Immediate Release\n",
      "Cal-Bay (Stock Symbol: CBYI)\n",
      "Wat ...\n"
     ]
    }
   ],
   "source": [
    "print(email_to_text(sample_html_spam)[:100],\"...\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Computations => comput\n",
      "Computation => comput\n",
      "Computing => comput\n",
      "Computed => comput\n",
      "Compute => comput\n",
      "Compulsive => compuls\n"
     ]
    }
   ],
   "source": [
    "import nltk\n",
    "from urlextract import URLExtract\n",
    "\n",
    "try:\n",
    "    import nltk\n",
    "\n",
    "    stemmer = nltk.PorterStemmer()\n",
    "    for word in (\"Computations\", \"Computation\", \"Computing\", \"Computed\", \"Compute\", \"Compulsive\"):\n",
    "        print(word, \"=>\", stemmer.stem(word))\n",
    "except ImportError:\n",
    "    print(\"Error: stemming requires the NLTK module.\")\n",
    "    stemmer = None"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 将所有处理整合到一个转换器中，我们将使用电子邮件转换为文字计数器我们使用split（）的方法将句子拆分成单词"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.base import BaseEstimator,TransformerMixin\n",
    "class EmailToWordCounterTransformer(BaseEstimator,TransformerMixin):\n",
    "    def __init__(self,strip_headers=True,lower_case=True,remove_punctuation=True,replace_urls=True,replace_numbers=True\n",
    "                ,stemming=True):\n",
    "        self.strip_headers=strip_headers\n",
    "        self.lower_case=lower_case\n",
    "        self.remove_punctuation = remove_punctuation\n",
    "        self.replace_urls=replace_urls\n",
    "        self.replace_numbers=replace_numbers\n",
    "        self.stemming = stemming\n",
    "    def fit(self,x,y=None):\n",
    "        return self\n",
    "    def transform(self,x,y=None):\n",
    "        x_transformed =[]\n",
    "        for email in x:\n",
    "            text=email_to_text(email) or \"\"\n",
    "            if self.replace_urls:\n",
    "                extractor = URLExtract()\n",
    "                urls = list(set(extractor.find_urls(text)))\n",
    "                urls.sort(key=lambda url:len(url),reverse=True)\n",
    "                for url in urls :\n",
    "                    text =text.replace(url,\"URL\")\n",
    "            if self.replace_numbers:\n",
    "                text=re.sub(r'\\d+(?:\\.\\d*(?:[eE]\\d))?','NUMBER',text)\n",
    "            if self.remove_punctuation:\n",
    "                text=re.sub(r'\\w',' ',text,flags=re.M)\n",
    "            word_counts = Counter(text.split())\n",
    "            if self.stemming and stemmer is not None:\n",
    "                stemmed_word_counts =Counter()\n",
    "                for word,count in word_counts.items():\n",
    "                    stemmed_word = stemmer.stem(word)\n",
    "                    stemmed_word_counts[stemmed_word]+=count\n",
    "                word_counts = stemmed_word_counts\n",
    "            x_transformed.append(word_counts)\n",
    "        return np.array(x_transformed)\n",
    "                "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 测试转换器"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([Counter({':': 1, '>[...': 1, '...]': 1, '.': 1}),\n",
       "       Counter({',': 10, '.': 5, '...': 2, ':': 2, '\"': 2, '.\"': 2, '?': 1, ';': 1, '...(': 1, ')': 1}),\n",
       "       Counter({'>': 5, ',': 5, '@': 3, ':': 3, '\"': 2, \"'\": 2, '!': 2, '---': 1, '...,': 1, '<': 1, '...>': 1, '....': 1, '------------------------': 1, '---------------------~-->': 1, '+': 1, '&': 1, '---------------------------------------------------------------------~->': 1, '-': 1, '.': 1})],\n",
       "      dtype=object)"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x_few = x_train[:3]\n",
    "x_few_wordcounts = EmailToWordCounterTransformer().fit_transform(x_few)\n",
    "x_few_wordcounts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
